library(data.table)
library(Rtsne)
library(R.utils)
library(umap)
library(ggplot2)
theme_set(theme_bw())

Processing data

Single cell sequencing data for all cells

fnm1 = "GSE120575_Sade_Feldman_melanoma_single_cells_nolabel_1.txt.gz"
fnm2 = "GSE120575_Sade_Feldman_melanoma_single_cells_2.txt.gz"

sf_tpm1 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm1), 
                fill = TRUE, header = TRUE)

sf_tpm2 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm2), 
                fill = TRUE, drop = 16293, col.names = colnames(sf_tpm1))

ls_tpm = list(sf_tpm1,sf_tpm2)
sf_tpm = rbindlist(ls_tpm)

rm(ls_tpm)
rm(sf_tpm1)
rm(sf_tpm2)
dim(sf_tpm) # 55737 x 16292
## [1] 55737 16292
sf_tpm[1:2,1:5]
cell.names = colnames(sf_tpm)[-1]
gene.names = sf_tpm$V1
rownames(sf_tpm) = gene.names
sf_tpm$V1 = NULL

dim(sf_tpm)
## [1] 55737 16291
sf_tpm[1:2,1:5]

Table of CD8+ T cells

xnm = "1-s2.0-S0092867418313941-mmc2.xlsx"
table_s2 = readxl::read_xlsx(paste0("../../_reference/Sade_Feldman_2018/", xnm), 
                             sheet = "Cluster annotation-Fig2A-B", col_names = TRUE)
dim(table_s2)
## [1] 6350    2
table_s2[1:2,]
table_s2 = as.data.frame(table_s2)

table(table_s2$`Cell Name` %in% cell.names)
## 
## TRUE 
## 6350
table(table_s2$Cluster)
## 
## CD8_B CD8_G 
##  3023  3327

Data for only CD8+ T cells

CD8T = table_s2$`Cell Name`
sf_CD8T_tpm = sf_tpm[ , ..CD8T]
rownames(sf_CD8T_tpm) = gene.names
rm(sf_tpm)

How many cells each gene is expressed

gene.grouping = rep(1:28, each = 2000)[1:dim(sf_CD8T_tpm)[1]]
cells.per.gene = NULL
for (i in 1:28) {
  temp.counts = apply(sf_CD8T_tpm[gene.grouping==i,], 1, FUN = function(x) sum(x>0))
  cells.per.gene = c(cells.per.gene,temp.counts)
}

table(cells.per.gene == 0)
## 
## FALSE  TRUE 
## 46790  8947
table(cells.per.gene == 1)
## 
## FALSE  TRUE 
## 51118  4619
hist(log10(cells.per.gene), main = "CD8 T cells (CD8_B + CD8_G)", 
     xlab = "log10(number of cells) each gene expressed", breaks=50)

rm(gene.grouping)

Choose high-variance genes

plot the variance of the expression levels for each gene against its mean expression level

# kept 11662 genes
large.express.genes = gene.names[which(cells.per.gene > 200)]
large.expressed.tpm = sf_CD8T_tpm[match(large.express.genes,rownames(sf_CD8T_tpm)),]
dim(large.expressed.tpm)
## [1] 11662  6350
df_mv = data.frame(gene=large.express.genes, mean=rowMeans(large.expressed.tpm),
                   var=apply(large.expressed.tpm, 1, var))
rm(large.express.genes)
rm(large.expressed.tpm)

ggplot(data = df_mv) + 
  geom_point(aes(x=mean,y=var),size = 0.1)

divide the genes into 20 equal-size bins based on their mean expression levels, and select the first 50 genes with the highest variance from each bin.

df_mv$grouping = cut_number(df_mv$mean, n=20)
table(df_mv$grouping)
## 
## [0.0214,0.157]   (0.157,0.21]   (0.21,0.252]  (0.252,0.295]  (0.295,0.344] 
##            584            583            583            583            583 
##  (0.344,0.397]  (0.397,0.457]  (0.457,0.525]  (0.525,0.602]  (0.602,0.685] 
##            583            583            583            583            583 
##  (0.685,0.792]  (0.792,0.909]   (0.909,1.04]     (1.04,1.2]     (1.2,1.41] 
##            583            583            583            583            583 
##    (1.41,1.66]    (1.66,2.04]    (2.04,2.66]    (2.66,3.91]    (3.91,16.4] 
##            583            583            583            583            584
hv.genes.list = by(df_mv, df_mv$grouping, 
                   FUN = function(df) return(df$gene[order(df$var,decreasing = TRUE)][1:50]))
hv.genes = as.character(unlist(hv.genes.list))
rm(hv.genes.list)

hvg_tpm = sf_CD8T_tpm[match(hv.genes,rownames(sf_CD8T_tpm)),]
hvg_tpm = t(hvg_tpm)
colnames(hvg_tpm) = hv.genes
rm(df_mv)

Perform PCA on selected genes

hvg_tpm_log = log(hvg_tpm + 1)
pca.hvg = princomp(hvg_tpm_log, cor = TRUE)

screen plot

eigvals = pca.hvg$sdev^2
pve = eigvals/sum(eigvals)
plot(1:50, pve[1:50], main = "scree plot (first 50 PCs)", type="b", 
     xlab="i-th PC",ylab="Proportion of Variance Explained")

rm(eigvals)

plot of cumulative proportion of variance explained

yvec = cumsum(pve)
plot(yvec, type="l", main = "plot of cumulative proportion of variance",  
     xlab = "i-th Principal Components", 
     ylab = "Cumulative variance", ylim=c(0,1))
abline(h=0.8, lty = 2)

plot(yvec[1:100], type="l", main = "plot of cumulative proportion of variance",  
     xlab = "i-th Principal Components", 
     ylab = "Cumulative variance")

pca.scores = pca.hvg$scores
rm(pca.hvg)

Clustering on top 50 PCs

top50pcs = pca.scores[,1:50]
rm(pca.scores)

Calculate TSNE

set.seed(9999)
tsne = Rtsne(top50pcs, pca=FALSE)
df_tsne = as.data.frame(tsne$Y)
names(df_tsne) = paste0("topPC_TSNE",seq(ncol(tsne$Y)))
rm(tsne)

Calculate UMAP

pcs_umap = umap(top50pcs)
df_umap = as.data.frame(pcs_umap$layout)
names(df_umap) = paste0("topPC_UMAP",seq(ncol(df_umap)))
rm(pcs_umap)

K-means clustering on top 50 PCs

Prepare dataframe for k-means clustering using top 50 PCs

df_top50PCs = cbind(top50pcs, df_tsne)
df_top50PCs = cbind(df_top50PCs, df_umap)

rm(df_tsne)
rm(df_umap)

dim(df_top50PCs)
## [1] 6350   54
df_top50PCs[1:2, c(1:2,40:52)]
rm(top50pcs)

Try Kmeans for 2 to 8 clusters

set.seed(9999)
all_num_clust = c(2:8)

for (num_clust in all_num_clust) {
  cat(paste0("KM with ",num_clust," clusters.\n"))
  kmeans_out = kmeans(df_top50PCs[,1:50], centers = num_clust, 
                      iter.max = 1e3, nstart = 50, algorithm = "MacQueen")
  print(kmeans_out[c("betweenss","tot.withinss")])
  km_label = paste0("KM_",num_clust)
  df_top50PCs[[km_label]] = as.factor(kmeans_out$cluster)
  
  pt1 = ggplot(data=df_top50PCs) + 
    geom_point(aes(x=topPC_TSNE1,y=topPC_TSNE2,color=eval(as.name(paste(km_label)))),
               size = 0.3) + 
    scale_colour_discrete(name=paste(km_label)) + 
    guides(color = guide_legend(override.aes = list(size = 2)))
  
  pt2 = ggplot(data=df_top50PCs) + 
    geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=eval(as.name(paste(km_label)))),
               size = 0.3) + 
    scale_colour_discrete(name=paste(km_label)) + 
    guides(color = guide_legend(override.aes = list(size = 2)))
    
  print(pt1)
  print(pt2)

  rm(kmeans_out)
  rm(pt1)
  rm(pt2)
}
## KM with 2 clusters.
## $betweenss
## [1] 184844.6
## 
## $tot.withinss
## [1] 831430.6

## KM with 3 clusters.
## $betweenss
## [1] 254609.9
## 
## $tot.withinss
## [1] 761665.3

## KM with 4 clusters.
## $betweenss
## [1] 276162.9
## 
## $tot.withinss
## [1] 740112.4

## KM with 5 clusters.
## $betweenss
## [1] 293896.1
## 
## $tot.withinss
## [1] 722379.1

## KM with 6 clusters.
## $betweenss
## [1] 310972.3
## 
## $tot.withinss
## [1] 705303

## KM with 7 clusters.
## $betweenss
## [1] 323238.4
## 
## $tot.withinss
## [1] 693036.9

## KM with 8 clusters.
## $betweenss
## [1] 332961.8
## 
## $tot.withinss
## [1] 683313.5

Compared to the clustering result from Sade-Feldman et al., 2018

# CD8_G cluster and CD8_B cluster label for each CD8+ T cell
df_top50PCs$CD8T = as.factor(table_s2$Cluster)

# 6 CD8+ T cell clusters in Figure 4
xnm = "1-s2.0-S0092867418313941-mmc4.xlsx"
table_s4 = readxl::read_xlsx(paste0("../../_reference/Sade_Feldman_2018/", xnm), 
                             sheet = "Cluster annotation-Fig4A-B", col_names = TRUE)
dim(table_s4)
## [1] 6350    2
table_s4[1:2,]
table(table_s4$Cluster)
## 
## CD8_1 CD8_2 CD8_3 CD8_4 CD8_5 CD8_6 
##   601   856  1744   980   808  1361
table(rownames(df_top50PCs) == table_s4$`Cell Name`)
## 
## TRUE 
## 6350
df_top50PCs$sf_cluster = as.factor(table_s4$Cluster)

CD8_G cluster and CD8_B cluster annotation from Sade-Feldman et al., 2018

ggplot(data=df_top50PCs) + 
  geom_point(aes(x=topPC_TSNE1,y=topPC_TSNE2,color=CD8T),size = 0.3) + 
  guides(color = guide_legend(override.aes = list(size = 2)))

ggplot(data=df_top50PCs) + 
  geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=CD8T),size = 0.3) + 
  guides(color = guide_legend(override.aes = list(size = 2)))

In the K-means analysis result using 3 clusters, the cluster of CD8_B cells and that of CD8_D cells could be roughly identified. Cluster 1 contains all CD8_B cells, cluster 2 contains mostly CD8_B cells, and cluster 3 contains mostly CD8_G cells.

ggplot(data=df_top50PCs) + 
  geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=KM_3),size = 0.3) + 
  guides(color = guide_legend(override.aes = list(size = 2)))

tb1 = table(df_top50PCs$KM_3, df_top50PCs$CD8T)
tb1
##    
##     CD8_B CD8_G
##   1   672  3046
##   2   425     0
##   3  1926   281
CD8B_prop = tb1[,"CD8_B"]/rowSums(tb1)
CD8G_prop = tb1[,"CD8_G"]/rowSums(tb1)

CD8B_prop
##         1         2         3 
## 0.1807423 1.0000000 0.8726778
CD8G_prop
##         1         2         3 
## 0.8192577 0.0000000 0.1273222
CD8B_cluster = which(CD8B_prop > 0.8)
CD8G_cluster = which(CD8G_prop > 0.8)

table_s2$ruoyi_cluster = rep(NA, nrow(table_s2))
table_s2$ruoyi_cluster[df_top50PCs$KM_3 %in% CD8B_cluster & df_top50PCs$CD8T == "CD8_B"] = "CD8_B"
table_s2$ruoyi_cluster[df_top50PCs$KM_3 %in% CD8G_cluster & df_top50PCs$CD8T == "CD8_G"] = "CD8_G"
table(table_s2$ruoyi_cluster, table_s2$Cluster, useNA = "ifany")
##        
##         CD8_B CD8_G
##   CD8_B  2351     0
##   CD8_G     0  3046
##   <NA>    672   281
write.csv(table_s2, "output/CD8_cluster.csv", quote = FALSE, row.names = FALSE)

Fine clustering of CD8+ T cells in Sade-Feldman et al., 2018

ggplot(data=df_top50PCs) + 
  geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=sf_cluster),size = 0.3) + 
  guides(color = guide_legend(override.aes = list(size = 2)))

K-means analysis using 8 clusters have some overlap with the CD8+ T cells clusters defined by the paper.

ggplot(data=df_top50PCs) + 
  geom_point(aes(x=topPC_UMAP1,y=topPC_UMAP2,color=KM_8),size = 0.3) + 
  guides(color = guide_legend(override.aes = list(size = 2)))

table(df_top50PCs$KM_8, df_top50PCs$sf_cluster)
##    
##     CD8_1 CD8_2 CD8_3 CD8_4 CD8_5 CD8_6
##   1     5     4     9   365    19   604
##   2    13   435   888    77   121   151
##   3    55   273   650    20     3    51
##   4   145     0     0     0     0     0
##   5   216    15    19     1     0     0
##   6     0   106   103   414   651   351
##   7   167    17    13     2     0     1
##   8     0     6    62   101    14   203
rm(df_top50PCs)
rm(sf_CD8T_tpm)
gc()
##             used  (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   2099182 112.2    5444354  290.8         NA    5444354   290.8
## Vcells 108023654 824.2 1088285418 8303.0      32768 2125549550 16216.7
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggplot2_3.3.1     umap_0.2.7.0      R.utils_2.9.2     R.oo_1.23.0      
## [5] R.methodsS3_1.8.0 Rtsne_0.15        data.table_1.12.8
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3       RSpectra_0.16-0  cellranger_1.1.0 compiler_3.6.2  
##  [5] pillar_1.4.3     tools_3.6.2      digest_0.6.23    jsonlite_1.6.1  
##  [9] evaluate_0.14    lifecycle_0.2.0  tibble_3.0.1     gtable_0.3.0    
## [13] lattice_0.20-38  pkgconfig_2.0.3  rlang_0.4.6      Matrix_1.2-18   
## [17] yaml_2.2.1       xfun_0.12        withr_2.1.2      dplyr_0.8.4     
## [21] stringr_1.4.0    knitr_1.28       vctrs_0.3.0      askpass_1.1     
## [25] tidyselect_1.0.0 grid_3.6.2       reticulate_1.15  glue_1.3.1      
## [29] R6_2.4.1         readxl_1.3.1     rmarkdown_2.1    farver_2.0.3    
## [33] purrr_0.3.3      magrittr_1.5     scales_1.1.0     htmltools_0.4.0 
## [37] ellipsis_0.3.0   assertthat_0.2.1 colorspace_1.4-1 labeling_0.3    
## [41] stringi_1.4.5    openssl_1.4.1    munsell_0.5.0    crayon_1.3.4